Prep data

Load necessary packages

# Move into the project folder so the relative CSV path used later resolves.
# The path is specific to the original author's machine, so it is guarded:
# an unguarded setwd() aborts the whole script when the folder does not exist.
project_dir <- "~/Desktop/working-with-lyle/Formality_Project"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}

# Install pacman only when it is missing. requireNamespace() tests for the
# package without attaching it and without the warning require() emits.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# p_load() attaches every listed package and installs any that are absent.
pacman::p_load(
  tidyverse, rlang, zoo, lubridate, plotrix, ggpubr, caret, broom,
  kableExtra, reactable,
  install = TRUE
)

Define aesthetics for the graphs (color palettes and a shared plot theme)

# Hex colour palettes defined for the figures.
palette_map <- c("#3B9AB2", "#EBCC2A", "#F21A00")
palette_condition <- c("#ee9b00", "#bb3e03", "#005f73")

# Shared theme added to every plot: theme_classic() with the legend on top,
# 16 pt "Futura Medium" text, black axis text and lines, and no y-axis ticks.
plot_aes <- theme_classic() +
  theme(
    text = element_text(size = 16, family = "Futura Medium"),
    legend.position = "top",
    legend.text = element_text(size = 12),
    axis.text = element_text(color = "black"),
    axis.line = element_line(colour = "black"),
    axis.ticks.y = element_blank()
  )

Define Table Functions

 # Render the coefficient table of a fitted model as a styled kable.
 #   model_data: a fitted model accepted by tidy() (lm fits in this file).
 # Returns the result of kableExtra::kable_styling(); the std.error,
 # statistic and p.value columns are shown as SE, t and p.
 table_model <- function(model_data) {
   coef_table <- tidy(model_data)
   coef_table <- rename(
     coef_table,
     "SE" = std.error,
     "t" = statistic,
     "p" = p.value
   )
   kableExtra::kable_styling(kable(coef_table))
 }

Load data and do a quick clean of missing data and prize winners

# Read the books CSV; the path is relative to the current working directory.
df <- read_csv(file = "books_cleaned_LIWC.csv")

Tidy the data

 # Collapse df to one row per publication year, producing <variable>_mean and
 # <variable>_std.error columns for the four measures of interest.
 # summarise(across()) replaces summarise_at() with funs(): funs() is
 # deprecated in dplyr and the old call also carried a stray trailing comma.
 # Column names are unchanged; columns are now ordered per variable
 # (mean, std.error) rather than per statistic.
 tidy_df <- df %>%
   group_by(ORIG_PUBL_DATE) %>%
   summarise(
     across(
       all_of(c("Analytic", "WPS", "BigWords", "Period")),
       list(mean = mean, std.error = std.error)
     )
   )

# Pull the per-year summary row for 1933. It is kept for reference only: the
# constants below are typed in by hand rather than read from this object.
year_means <- dplyr::filter(tidy_df, ORIG_PUBL_DATE == 1933)

# Shift each yearly mean by a fixed reference value.
# NOTE(review): this comment previously said "centered on 1857", but the
# reference row pulled above is 1933; the constants are presumably the
# rounded 1933 means -- confirm against year_means.
tidy_df <- tidy_df %>%
  mutate(
    Analytic_centered = Analytic_mean - 53.1,
    WPS_centered = WPS_mean - 12.52,
    BigWords_centered = BigWords_mean - 16.06,
    Period_centered = Period_mean - 8.13
  )

Corpus Summary Stats

Raw count of Books

# One-cell table with the number of rows in df
# (assumes one row per book -- TODO confirm).
df %>%
  select(Filename) %>%
  dplyr::tally() %>%
  reactable::reactable(striped = TRUE)

Number of Authors

# Number of distinct authors. n() counted rows, which simply repeated the
# book count; n_distinct() counts each author once.
# NOTE(review): authors are identified by AUTH_LAST only, so different authors
# who share a surname are counted as one -- confirm this is acceptable.
df %>%
  select(AUTH_LAST) %>%
  dplyr::summarize(n = dplyr::n_distinct(AUTH_LAST)) %>%
  reactable::reactable(striped = TRUE)

Number of authors per year

# Number of distinct author surnames per publication year, rendered as a
# striped reactable. auth_year holds the widget, not the underlying table.
auth_year <- df %>%
  dplyr::distinct(ORIG_PUBL_DATE, AUTH_LAST) %>%
  dplyr::count(ORIG_PUBL_DATE) %>%
  reactable::reactable(striped = TRUE)
auth_year

Sex Distribution

# Number of distinct files per author gender, rendered as a striped reactable.
# auth_sex holds the widget, not the underlying table.
auth_sex <- df %>%
  dplyr::distinct(AUTH_GENDER, Filename) %>%
  dplyr::count(AUTH_GENDER) %>%
  reactable::reactable(striped = TRUE)
auth_sex

Word Count by Sex

 # Mean word count (WC) per author gender, rendered as a striped reactable.
 # Rows are de-duplicated on Filename as well as gender and WC, as the
 # gender-distribution table does. De-duplicating on (AUTH_GENDER, WC) alone
 # would also drop distinct books that happen to share a word count, which
 # biases the mean.
 WC_sex <- df %>%
   select(AUTH_GENDER, Filename, WC) %>%
   unique() %>%
   group_by(AUTH_GENDER) %>%
   dplyr::summarize(mean = mean(WC)) %>%
   reactable::reactable(striped = TRUE)
 WC_sex

Part of Speech Graphs

Plot the Smoothed Data

#Plot our smoothed data 

#we are using Non-tidy data here to capture the individual variation 

#Analytic Thinking 

# Analytic Thinking: one point per row of df with a loess trend (span 0.5)
# over publication year; the regression estimate and p-value are printed on
# the panel.
#  - The y axis reads "Standardized score", the label the yearly Analytic
#    plot in this file uses, instead of "% of Total Words".
#  - The annotation no longer carries a dangling "intercept = " line with no
#    value; like the other smoothed plots it shows estimate and p-value only.
#  - The chain of theme() calls is collapsed to the values that took effect
#    (later calls overrode the earlier sizes).
Analytic_smooth <- ggplot(df, aes(x = ORIG_PUBL_DATE, y = Analytic, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.2) +
  geom_smooth(method = "loess", span = 0.50) +
  plot_aes +
  labs(title = "Analytic Thinking", x = "Year", y = "Standardized score") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  ) +
  annotate(
    geom = "text", x = 1935, y = 70, size = 3.5,
    label = "
             estimate = -0.0484
             p-value = 0.0785
           
           "
  )

# Big Words: one point per row of df with a loess trend (span 0.5) over
# publication year; the regression estimate and p-value are printed on the
# panel.
Bw_smooth <- ggplot(df, aes(x = ORIG_PUBL_DATE, y = BigWords, group = 1)) +
  geom_point(alpha = 0.2, color = "dodgerblue3") +
  geom_smooth(span = 0.50, method = "loess") +
  plot_aes +
  labs(title = "Big Words (Letters > 6)", x = "Year", y = "% of Total Words") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  ) +
  annotate(
    geom = "text", x = 1935, y = 20, size = 3.5,
    label = "
             estimate = -0.0095
             p-value = 0.0236
           
           "
  )

# Period usage: one point per row of df with a loess trend (span 0.5) over
# publication year; the regression estimate and p-value are printed on the
# panel.
period_smooth <- ggplot(df, aes(x = ORIG_PUBL_DATE, y = Period, group = 1)) +
  geom_point(alpha = 0.2, color = "dodgerblue3") +
  geom_smooth(span = 0.50, method = "loess") +
  plot_aes +
  labs(title = "Period Usage", x = "Year", y = "% of Total Words") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  ) +
  annotate(
    geom = "text", x = 1935, y = 11, size = 3.5,
    label = "
             estimate = 0.0167
             p-value < .001
           
           "
  )

# Words per sentence: one point per row of df with a loess trend (span 0.7)
# over publication year; the regression estimate and p-value are printed on
# the panel.
wps_smooth <- ggplot(df, aes(x = ORIG_PUBL_DATE, y = WPS, group = 1)) +
  geom_point(alpha = 0.2, color = "dodgerblue3") +
  geom_smooth(span = 0.70, method = "loess") +
  plot_aes +
  labs(title = "Words per Sentence", x = "Year", y = "# of Words") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  ) +
  annotate(
    geom = "text", x = 1935, y = 25, size = 3.5,
    label = "
             estimate = -0.0348
             p-value < .001
           
           "
  )


# Arrange the four row-level smoothed plots in a 2 x 2 grid and add a title
# and a figure note.
# The note previously called the shading "Standard Error". geom_smooth()
# draws a confidence band (95% by default), so the note now says so.
smooth_graphs <- ggpubr::ggarrange(
  Analytic_smooth, Bw_smooth, period_smooth, wps_smooth,
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom"
)
annotate_figure(
  smooth_graphs,
  top = text_grob(
    "Smooth Formality Graphs",
    color = "black", face = "bold", size = 20
  ),
  bottom = text_grob(
    "Note. Shaded bands represent the 95% confidence interval of the loess fit.",
    color = "Black", hjust = 1.0, x = 1, face = "italic", size = 14
  )
)

Smoothed data by year

# Analytic Thinking, yearly means: one point per row of tidy_df with a loess
# trend (span 0.9); the regression estimate and p-value are printed on the
# panel.
#  - The y axis reads "Standardized score", the label the yearly Analytic
#    line plot in this file uses, instead of "% of Total Words".
#  - The chain of theme() calls is collapsed to the values that took effect
#    (later calls overrode the earlier sizes).
Analytic_smooth_tidy <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = Analytic_mean, group = 1)
) +
  geom_point(color = "dodgerblue3", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.90) +
  plot_aes +
  labs(title = "Analytic Thinking", x = "Year", y = "Standardized score") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  ) +
  annotate(
    geom = "text", x = 1935, y = 60, size = 3.5,
    label = "
             estimate = -0.0404
             p-value =  0.1339
           
           "
  )

# Big Words, yearly means: one point per row of tidy_df with a loess trend
# (span 0.6); the regression estimate and p-value are printed on the panel.
Bw_smooth_tidy <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = BigWords_mean, group = 1)
) +
  geom_point(alpha = 0.5, color = "dodgerblue3") +
  geom_smooth(span = 0.60, method = "loess") +
  plot_aes +
  labs(title = "Big Words N > 6", x = "Year", y = "% of Total Words") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  ) +
  annotate(
    geom = "text", x = 1935, y = 17, size = 3.5,
    label = "
             estimate = -0.0075
             p-value =  0.0999
           
           "
  )

# Period usage, yearly means: one point per row of tidy_df with a loess trend
# (span 0.6); the regression estimate and p-value are printed on the panel.
period_smooth_tidy <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = Period_mean, group = 1)
) +
  geom_point(alpha = 0.5, color = "dodgerblue3") +
  geom_smooth(span = 0.60, method = "loess") +
  plot_aes +
  labs(title = "Period Usage", x = "Year", y = "% of Total Words") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  ) +
  annotate(
    geom = "text", x = 1935, y = 8, size = 3.5,
    label = "
             estimate = 0.0166
             p-value < .001
           
           "
  )

# Words per sentence, yearly means: one point per row of tidy_df with a loess
# trend (span 0.9); the regression estimate and p-value are printed on the
# panel.
wps_smooth_tidy <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = WPS_mean, group = 1)
) +
  geom_point(alpha = 0.5, color = "dodgerblue3") +
  geom_smooth(span = 0.90, method = "loess") +
  plot_aes +
  labs(title = "Words per Sentence", x = "Year", y = "# of Words") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  ) +
  annotate(
    geom = "text", x = 1935, y = 16, size = 3.5,
    label = "
             estimate = -0.0347
             p-value < .001
           
           "
  )


# Arrange the four yearly-mean smoothed plots in a 2 x 2 grid and add a title
# and a figure note.
#  - The note previously called the shading "Standard Error". geom_smooth()
#    draws a confidence band (95% by default), so the note now says so.
#  - "Estimates show are" is corrected to "Estimates shown are".
#  - The line break is an explicit "\n", so the source indentation is no
#    longer embedded in the rendered note.
tidy_smooth_graphs <- ggpubr::ggarrange(
  Analytic_smooth_tidy, Bw_smooth_tidy, period_smooth_tidy, wps_smooth_tidy,
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom"
)
annotate_figure(
  tidy_smooth_graphs,
  top = text_grob(
    "Smooth Formality Graphs (grouped by year)",
    color = "black", face = "bold", size = 20
  ),
  bottom = text_grob(
    paste0(
      "Note. Shaded bands represent the 95% confidence interval of the ",
      "loess fit.\n",
      "Estimates shown are from centered analyses ",
      "(centered on 1933; first year in the dataset)."
    ),
    color = "Black", hjust = 1.05, x = 1, face = "italic", size = 16
  )
)

Make our rough plots (means per year)

Plotting the data by year (one data point per year).

# Yearly mean of Analytic as a line, with a ribbon spanning the mean
# plus/minus the per-year std.error column.
Analytic <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = Analytic_mean, group = 1)
) +
  geom_line(colour = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = Analytic_mean - Analytic_std.error,
      ymax = Analytic_mean + Analytic_std.error
    ),
    alpha = 0.2
  ) +
  plot_aes +
  labs(title = "Analytic Thinking", x = "Year", y = "Standardized score") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  )

# Yearly mean of WPS as a line, with a ribbon spanning the mean plus/minus
# the per-year std.error column.
WPS <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = WPS_mean, group = 1)
) +
  geom_line(colour = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = WPS_mean - WPS_std.error,
      ymax = WPS_mean + WPS_std.error
    ),
    alpha = 0.2
  ) +
  plot_aes +
  labs(title = "WPS", x = "Year", y = "# of Words") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  )

# Yearly mean of BigWords as a line, with a ribbon spanning the mean
# plus/minus the per-year std.error column.
BigWords <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = BigWords_mean, group = 1)
) +
  geom_line(colour = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = BigWords_mean - BigWords_std.error,
      ymax = BigWords_mean + BigWords_std.error
    ),
    alpha = 0.2
  ) +
  plot_aes +
  labs(title = "Big Words N > 6", x = "Year", y = "% of Total Words") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  )


# Yearly mean of Period as a line, with a ribbon spanning the mean plus/minus
# the per-year std.error column.
Period <- ggplot(
  tidy_df,
  aes(x = ORIG_PUBL_DATE, y = Period_mean, group = 1)
) +
  geom_line(colour = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = Period_mean - Period_std.error,
      ymax = Period_mean + Period_std.error
    ),
    alpha = 0.2
  ) +
  plot_aes +
  labs(title = "Period-usage", x = "Year", y = "% of Total Words") +
  # Net effect of the former chain of theme() calls (later sizes won).
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  )

# Arrange the four yearly line plots in a 2 x 2 grid and add a title and a
# figure note. The note text gains the missing space after "Note."
# (it previously rendered as "Note.Graphs").
raw_graphs <- ggpubr::ggarrange(
  Analytic, BigWords, Period, WPS,
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom"
)
annotate_figure(
  raw_graphs,
  top = text_grob(
    "Raw Formality Graphs (grouped by year)",
    color = "black", face = "bold", size = 20
  ),
  bottom = text_grob(
    "Note. Graphs are of books in the collection",
    color = "Black", hjust = 1.7, x = 1, face = "italic", size = 16
  )
)

Regression Models

Models presented in order: raw data, aggregated by year, centered on 1933

Analytic Thinking

# Linear trend of Analytic over publication year, fitted three ways.

# Row-level data
AT_RAW <- lm(formula = Analytic ~ ORIG_PUBL_DATE, data = df)

# Yearly means
AT_TIDY <- lm(formula = Analytic_mean ~ ORIG_PUBL_DATE, data = tidy_df)

# Yearly means shifted by a constant (the Analytic_centered column)
# NOTE(review): only the outcome is shifted and the predictor is the raw
# year, so the slope equals AT_TIDY's and the intercept still refers to
# year 0 -- confirm this is the intended centering.
AT_centered <- lm(formula = Analytic_centered ~ ORIG_PUBL_DATE, data = tidy_df)

table_model(AT_RAW)
term estimate SE t p
(Intercept) 147.5605 54.4342 2.711 0.0069
ORIG_PUBL_DATE -0.0484 0.0275 -1.763 0.0785
table_model(AT_TIDY)
term estimate SE t p
(Intercept) 131.7613 52.7234 2.499 0.0144
ORIG_PUBL_DATE -0.0404 0.0267 -1.513 0.1339
table_model(AT_centered)
term estimate SE t p
(Intercept) 78.6613 52.7234 1.492 0.1394
ORIG_PUBL_DATE -0.0404 0.0267 -1.513 0.1339

Big Words (words with a letter count > 6)

# Linear trend of BigWords over publication year: row-level data, yearly
# means, and yearly means shifted by a constant (BigWords_centered).
# NOTE(review): only the outcome is shifted in the centered fit, so its slope
# equals BW_Tidy's and the intercept still refers to year 0 -- confirm.
BW_Raw <- lm(formula = BigWords ~ ORIG_PUBL_DATE, data = df)
BW_Tidy <- lm(formula = BigWords_mean ~ ORIG_PUBL_DATE, data = tidy_df)
BW_centered <- lm(formula = BigWords_centered ~ ORIG_PUBL_DATE, data = tidy_df)

table_model(BW_Raw)
term estimate SE t p
(Intercept) 34.9153 8.3032 4.205 0.0000
ORIG_PUBL_DATE -0.0095 0.0042 -2.270 0.0236
table_model(BW_Tidy)
term estimate SE t p
(Intercept) 30.9375 8.9125 3.471 0.0008
ORIG_PUBL_DATE -0.0075 0.0045 -1.663 0.0999
table_model(BW_centered)
term estimate SE t p
(Intercept) 14.8775 8.9125 1.669 0.0987
ORIG_PUBL_DATE -0.0075 0.0045 -1.663 0.0999

Periods

# Linear trend of Period over publication year: row-level data, yearly means,
# and yearly means shifted by a constant.
# The last model object shares its name with the tidy_df column it models;
# inside lm() the name resolves to the column because data = tidy_df.
# NOTE(review): only the outcome is shifted in the centered fit, so its slope
# equals Period_Tidy's and the intercept still refers to year 0 -- confirm.
Period_Raw <- lm(formula = Period ~ ORIG_PUBL_DATE, data = df)
Period_Tidy <- lm(formula = Period_mean ~ ORIG_PUBL_DATE, data = tidy_df)
Period_centered <- lm(formula = Period_centered ~ ORIG_PUBL_DATE, data = tidy_df)
table_model(Period_Raw)
term estimate SE t p
(Intercept) -25.4425 5.0328 -5.055 0
ORIG_PUBL_DATE 0.0167 0.0025 6.570 0
table_model(Period_Tidy)
term estimate SE t p
(Intercept) -25.3173 4.4555 -5.682 0
ORIG_PUBL_DATE 0.0166 0.0023 7.377 0
table_model(Period_centered)
term estimate SE t p
(Intercept) -33.4473 4.4555 -7.507 0
ORIG_PUBL_DATE 0.0166 0.0023 7.377 0

Words per Sentence

# Linear trend of WPS over publication year: row-level data, yearly means,
# and yearly means shifted by a constant.
# The last model object shares its name with the tidy_df column it models;
# inside lm() the name resolves to the column because data = tidy_df.
# NOTE(review): only the outcome is shifted in the centered fit, so its slope
# equals WPS_Tidy's and the intercept still refers to year 0 -- confirm.
WPS_Raw <- lm(formula = WPS ~ ORIG_PUBL_DATE, data = df)
WPS_Tidy <- lm(formula = WPS_mean ~ ORIG_PUBL_DATE, data = tidy_df)
WPS_centered <- lm(formula = WPS_centered ~ ORIG_PUBL_DATE, data = tidy_df)

table_model(WPS_Raw)
term estimate SE t p
(Intercept) 81.6724 9.6359 8.476 0
ORIG_PUBL_DATE -0.0348 0.0049 -7.159 0
table_model(WPS_Tidy)
term estimate SE t p
(Intercept) 81.4208 8.7646 9.290 0
ORIG_PUBL_DATE -0.0347 0.0044 -7.823 0
table_model(WPS_centered)
term estimate SE t p
(Intercept) 68.9008 8.7646 7.861 0
ORIG_PUBL_DATE -0.0347 0.0044 -7.823 0